{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "JHDJLKDuJkcB"
   },
   "source": [
    "# Example for using word2vec"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "9m8ChZsdAx41"
   },
   "outputs": [],
   "source": [
    "# Import libraries\n",
    "try:\n",
    "  import textaugment, gensim\n",
    "except ModuleNotFoundError:\n",
    "  !pip -q install textaugment gensim\n",
    "  import textaugment, gensim"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 153
    },
    "colab_type": "code",
    "id": "ux6Bc4QSrYA8",
    "outputId": "9f2b8af1-3b22-455c-dd85-d1ac173a5317"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package wordnet to /root/nltk_data...\n",
      "[nltk_data]   Unzipping corpora/wordnet.zip.\n",
      "[nltk_data] Downloading package punkt to /root/nltk_data...\n",
      "[nltk_data]   Unzipping tokenizers/punkt.zip.\n",
      "[nltk_data] Downloading package averaged_perceptron_tagger to\n",
      "[nltk_data]     /root/nltk_data...\n",
      "[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 4,
     "metadata": {
      "tags": []
     },
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Import NLRK and download data\n",
    "import nltk\n",
    "nltk.download(['wordnet','punkt','averaged_perceptron_tagger'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "8AUt-F5MtiuI"
   },
   "source": [
    "## Load Google Word2vec embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 204
    },
    "colab_type": "code",
    "id": "1xq4dJtSr4RM",
    "outputId": "1ff32743-04a9-4b8a-eda3-8dcf55e711ca"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--2020-05-23 18:06:47--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz\n",
      "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.178.197\n",
      "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.178.197|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 1647046227 (1.5G) [application/x-gzip]\n",
      "Saving to: ‘GoogleNews-vectors-negative300.bin.gz’\n",
      "\n",
      "GoogleNews-vectors- 100%[===================>]   1.53G  36.2MB/s    in 44s     \n",
      "\n",
      "2020-05-23 18:07:31 (35.7 MB/s) - ‘GoogleNews-vectors-negative300.bin.gz’ saved [1647046227/1647046227]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Download Google Word2vec embeddings\n",
    "!wget \"https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 71
    },
    "colab_type": "code",
    "id": "q2wxTNhwrjK-",
    "outputId": "e30ff6b7-96a3-4d59-c486-65def436cbd8"
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.6/dist-packages/smart_open/smart_open_lib.py:253: UserWarning: This function is deprecated, use smart_open.open instead. See the migration notes for details: https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst#migrating-to-the-new-open-function\n",
      "  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL\n"
     ]
    }
   ],
   "source": [
    "model = gensim.models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin.gz', binary=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 71
    },
    "colab_type": "code",
    "id": "3uHnRL77uATl",
    "outputId": "de09c7ff-47bc-4e21-d2eb-89fcadb4d2bd"
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.6/dist-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n",
      "  if np.issubdtype(vec.dtype, np.int):\n"
     ]
    }
   ],
   "source": [
    "from textaugment import Word2vec\n",
    "t = Word2vec(model=model)\n",
    "output = t.augment('The stories are good')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 34
    },
    "colab_type": "code",
    "id": "BhVYt8V3uAwk",
    "outputId": "7c36d302-db66-4837-ff6b-ea1793a088d9"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "the stories are excellent\n"
     ]
    }
   ],
   "source": [
    "print(output)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "IWoNJrZfy94n"
   },
   "source": [
    "## Cite the paper\n",
    "```\n",
    "@article{marivate2019improving,\n",
    "  title={Improving short text classification through global augmentation methods},\n",
    "  author={Marivate, Vukosi and Sefara, Tshephisho},\n",
    "  journal={arXiv preprint arXiv:1907.03752},\n",
    "  year={2019}\n",
    "}```\n",
    "\n",
    "https://arxiv.org/abs/1907.03752"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "collapsed_sections": [],
   "name": "word2vec example.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
